{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Self join\n",
    "\n",
    "## Edinburgh Buses\n",
    "[Details of the database](https://sqlzoo.net/wiki/Edinburgh_Buses.) Looking at the data\n",
    "\n",
    "```\n",
    "stops(id, name)\n",
    "route(num, company, pos, stop)\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading spark-stubs, spark-hive\n",
      "Adding Hive conf dir /opt/hive/conf to classpath\n",
      "Creating SparkSession\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "SLF4J: No SLF4J providers were found.\n",
      "SLF4J: Defaulting to no-operation (NOP) logger implementation\n",
      "SLF4J: See https://www.slf4j.org/codes.html#noProviders for further details.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<a target=\"_blank\" href=\"http://jupyter:4040\">Spark UI</a>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\u001b[32mimport \u001b[39m\u001b[36m$ivy.$                                  \n",
       "\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.log4j.{Level, Logger}\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark._\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql._\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql.functions._\n",
       "\n",
       "\u001b[39m\n",
       "\u001b[36mspark\u001b[39m: \u001b[32mSparkSession\u001b[39m = org.apache.spark.sql.SparkSession@785ebcfa"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import $ivy.`org.apache.spark::spark-sql:3.4.0`\n",
    "\n",
    "import org.apache.log4j.{Level, Logger}\n",
    "Logger.getLogger(\"org\").setLevel(Level.OFF)\n",
    "\n",
    "import org.apache.spark._\n",
    "import org.apache.spark.sql._\n",
    "import org.apache.spark.sql.functions._\n",
    "\n",
    "val spark = {\n",
    "    NotebookSparkSession.builder()\n",
    "    .progress(false)\n",
    "    .appName(\"app09\")\n",
    "    // .master(\"spark://192.168.31.31:7077\")\n",
    "    .master(\"local[*]\")\n",
    "    .config(\"spark.sql.warehouse.dir\", \n",
    "            \"hdfs://192.168.31.31:9000/user/hive/warehouse\") \n",
    "    .config(\"spark.cores.max\", \"4\") \n",
    "    .config(\"spark.executor.instances\", \"1\") \n",
    "    .config(\"spark.executor.cores\", \"2\") \n",
    "    .config(\"spark.executor.memory\", \"10g\") \n",
    "    .config(\"spark.shuffle.service.enabled\", \"false\") \n",
    "    .config(\"spark.dynamicAllocation.enabled\", \"false\") \n",
    "    .config(\"spark.sql.catalogImplementation\", \"hive\")\n",
    "    .config(\"spark.sql.repl.eagerEval.enabled\", \"true\")\n",
    "    .config(\"spark.driver.allowMultipleContexts\", \"true\")\n",
    "    .getOrCreate()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\u001b[32mimport \u001b[39m\u001b[36mspark.implicits._\n",
       "\u001b[39m\n",
       "defined \u001b[32mfunction\u001b[39m \u001b[36msc\u001b[39m\n",
       "\u001b[36mhiveCxt\u001b[39m: \u001b[32msql\u001b[39m.\u001b[32mhive\u001b[39m.\u001b[32mHiveContext\u001b[39m = org.apache.spark.sql.hive.HiveContext@5dbfb5f"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import spark.implicits._\n",
    "def sc = spark.sparkContext\n",
    "val hiveCxt = new org.apache.spark.sql.hive.HiveContext(sc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "defined \u001b[32mclass\u001b[39m \u001b[36mRichDF\u001b[39m"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Credit to Aivean\n",
    "implicit class RichDF(val ds:DataFrame) {\n",
    "    def showHTML(limit: Int = 50, truncate: Int = 100) = {\n",
    "        import xml.Utility.escape\n",
    "        val data = ds.take(limit)\n",
    "        val header = ds.schema.fieldNames.toSeq        \n",
    "        val rows: Seq[Seq[String]] = data.map { row =>\n",
    "          row.toSeq.map {cell =>\n",
    "            val str = cell match {\n",
    "              case null => \"null\"\n",
    "              case binary: Array[Byte] => binary.map(\"%02X\".format(_)).mkString(\"[\", \" \", \"]\")\n",
    "              case array: Array[_] => array.mkString(\"[\", \", \", \"]\")\n",
    "              case seq: Seq[_] => seq.mkString(\"[\", \", \", \"]\")\n",
    "              case _ => cell.toString\n",
    "            }\n",
    "            if (truncate > 0 && str.length > truncate) {\n",
    "              // do not show ellipses for strings shorter than 4 characters.\n",
    "              if (truncate < 4) str.substring(0, truncate)\n",
    "              else str.substring(0, truncate - 3) + \"...\"\n",
    "            } else {\n",
    "              str\n",
    "            }\n",
    "          }: Seq[String]\n",
    "        }\n",
    "    publish.html(s\"\"\" <table>\n",
    "                <tr>\n",
    "                 ${header.map(h => s\"<th>${escape(h)}</th>\").mkString}\n",
    "                </tr>\n",
    "                ${rows.map {row =>\n",
    "                  s\"<tr>${row.map{c => s\"<td>${escape(c)}</td>\" }.mkString}</tr>\"\n",
    "                }.mkString}\n",
    "            </table>\n",
    "        \"\"\")\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\u001b[36mstops\u001b[39m: \u001b[32mDataFrame\u001b[39m = [id: int, name: string]\n",
       "\u001b[36mroute\u001b[39m: \u001b[32mDataFrame\u001b[39m = [num: string, company: string ... 2 more fields]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "val stops = hiveCxt.table(\"sqlzoo.stops\")\n",
    "val route = hiveCxt.table(\"sqlzoo.route\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.\n",
    "How many **stops** are in the database."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>count(id)</th>\n",
       "                </tr>\n",
       "                <tr><td>246</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "stops.agg(count(\"id\")).showHTML()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.\n",
    "Find the **id** value for the stop 'Craiglockhart'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>id</th>\n",
       "                </tr>\n",
       "                <tr><td>53</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "stops.filter(stops(\"name\")===\"Craiglockhart\").select(\"id\").showHTML()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3.\n",
    "Give the **id** and the **name** for the **stops** on the '4' 'LRT' service."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>id</th><th>name</th>\n",
       "                </tr>\n",
       "                <tr><td>19</td><td>Bingham</td></tr><tr><td>53</td><td>Craiglockhart</td></tr><tr><td>85</td><td>Fairmilehead</td></tr><tr><td>115</td><td>Haymarket</td></tr><tr><td>117</td><td>Hillend</td></tr><tr><td>149</td><td>London Road</td></tr><tr><td>177</td><td>Northfield</td></tr><tr><td>179</td><td>Oxgangs</td></tr><tr><td>194</td><td>Princes Street</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(stops.join(route, stops(\"id\")===route(\"stop\"), joinType=\"left\")\n",
    " .filter((col(\"num\")===\"4\") && (col(\"company\")===\"LRT\"))\n",
    " .select(\"id\", \"name\")\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Routes and stops\n",
    "\n",
    "The query shown gives the number of routes that visit either London Road (149) or Craiglockhart (53). Run the query and notice the two services that link these stops have a count of 2. Add a HAVING clause to restrict the output to these two routes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>company</th><th>num</th><th>count(stop)</th>\n",
       "                </tr>\n",
       "                <tr><td>LRT</td><td>45</td><td>2</td></tr><tr><td>LRT</td><td>4</td><td>2</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(route.filter(route(\"stop\").isin(List(149, 53): _*))\n",
    "    .groupBy(\"company\", \"num\")\n",
    "    .agg(count(\"stop\"))\n",
    "    .filter($\"count(stop)\"===2)\n",
    "    .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.\n",
    "Execute the self join shown and observe that b.stop gives all the places you can get to from Craiglockhart, without changing routes. Change the query so that it shows the services from Craiglockhart to London Road."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>company</th><th>num</th><th>stop</th><th>stop2</th>\n",
       "                </tr>\n",
       "                <tr><td>LRT</td><td>4</td><td>53</td><td>149</td></tr><tr><td>LRT</td><td>45</td><td>53</td><td>149</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(route.filter(col(\"stop\")===53)\n",
    " .join(route\n",
    "       .filter(col(\"stop\")===149)\n",
    "       .withColumnsRenamed(Map(\"stop\" -> \"stop2\")),\n",
    "      Seq(\"company\", \"num\"))\n",
    " .select(\"company\", \"num\", \"stop\", \"stop2\")\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.\n",
    "The query shown is similar to the previous one, however by joining two copies of the **stops** table we can refer to **stops** by **name** rather than by number. Change the query so that the services between 'Craiglockhart' and 'London Road' are shown. If you are tired of these places try 'Fairmilehead' against 'Tollcross'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>company</th><th>num</th><th>name</th><th>name2</th>\n",
       "                </tr>\n",
       "                <tr><td>LRT</td><td>4</td><td>Craiglockhart</td><td>London Road</td></tr><tr><td>LRT</td><td>45</td><td>Craiglockhart</td><td>London Road</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(route.join(route\n",
    "            .withColumnsRenamed(Map(\n",
    "                \"pos\" -> \"pos2\", \"stop\" -> \"stop2\")), \n",
    "            Seq(\"company\", \"num\"))\n",
    "    .join(stops, col(\"stop\")===stops(\"id\"))\n",
    "    .join(stops\n",
    "          .withColumnsRenamed(Map(\n",
    "              \"id\" -> \"id2\", \"name\" -> \"name2\")), \n",
    "          col(\"stop2\")===col(\"id2\"))\n",
    "    .filter((col(\"name\")===\"Craiglockhart\") && \n",
    "            (col(\"name2\")===\"London Road\"))\n",
    "    .select(\"company\", \"num\", \"name\", \"name2\")\n",
    "    .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. [Using a self join](https://sqlzoo.net/wiki/Using_a_self_join)\n",
    "\n",
    "Give a list of all the services which connect stops 115 and 137 ('Haymarket' and 'Leith')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>company</th><th>num</th>\n",
       "                </tr>\n",
       "                <tr><td>LRT</td><td>2A</td></tr><tr><td>LRT</td><td>2</td></tr><tr><td>LRT</td><td>25</td></tr><tr><td>SMT</td><td>C5</td></tr><tr><td>LRT</td><td>12</td></tr><tr><td>LRT</td><td>22</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(route.join(route\n",
    "            .withColumnsRenamed(Map(\n",
    "                \"pos\" -> \"pos2\", \"stop\" -> \"stop2\")),\n",
    "            Seq(\"company\", \"num\"))\n",
    "    .filter((col(\"stop\")===115) && (col(\"stop2\")===137))\n",
    "    .select(\"company\", \"num\")\n",
    "    .distinct()\n",
    "    .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8.\n",
    "Give a list of the services which connect the stops 'Craiglockhart' and 'Tollcross'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>company</th><th>num</th>\n",
       "                </tr>\n",
       "                <tr><td>LRT</td><td>10</td></tr><tr><td>LRT</td><td>27</td></tr><tr><td>LRT</td><td>45</td></tr><tr><td>LRT</td><td>47</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(route.join(route\n",
    "            .withColumnsRenamed(Map(\n",
    "                \"pos\" -> \"pos2\", \"stop\" -> \"stop2\")),\n",
    "            Seq(\"company\", \"num\"))\n",
    "    .join(stops, col(\"stop\")===col(\"id\"))\n",
    "    .join(stops\n",
    "          .withColumnsRenamed(Map(\n",
    "              \"id\" -> \"id2\", \"name\" -> \"name2\")), \n",
    "          col(\"stop2\")===col(\"id2\"))\n",
    "    .filter((col(\"name\")===\"Craiglockhart\") && \n",
    "            (col(\"name2\")===\"Tollcross\"))\n",
    "    .select(\"company\", \"num\")\n",
    "    .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9.\n",
    "Give a distinct list of the **stops** which may be reached from 'Craiglockhart' by taking one bus, including 'Craiglockhart' itself, offered by the LRT company. Include the company and bus no. of the relevant services."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>name2</th><th>company</th><th>num</th>\n",
       "                </tr>\n",
       "                <tr><td>Tollcross</td><td>LRT</td><td>27</td></tr><tr><td>Duddingston</td><td>LRT</td><td>45</td></tr><tr><td>Balerno Church</td><td>LRT</td><td>47</td></tr><tr><td>Craiglockhart</td><td>LRT</td><td>10</td></tr><tr><td>Hillend</td><td>LRT</td><td>4</td></tr><tr><td>Tollcross</td><td>LRT</td><td>47</td></tr><tr><td>Tollcross</td><td>LRT</td><td>45</td></tr><tr><td>Riccarton Campus</td><td>LRT</td><td>45</td></tr><tr><td>Princes Street</td><td>LRT</td><td>4</td></tr><tr><td>Oxgangs</td><td>LRT</td><td>27</td></tr><tr><td>Silverknowes</td><td>LRT</td><td>10</td></tr><tr><td>Balerno</td><td>LRT</td><td>47</td></tr><tr><td>Cockburn Crescent</td><td>LRT</td><td>47</td></tr><tr><td>London Road</td><td>LRT</td><td>4</td></tr><tr><td>Fairmilehead</td><td>LRT</td><td>4</td></tr><tr><td>Craiglockhart</td><td>LRT</td><td>45</td></tr><tr><td>London Road</td><td>LRT</td><td>45</td></tr><tr><td>Oxgangs</td><td>LRT</td><td>4</td></tr><tr><td>Crewe Toll</td><td>LRT</td><td>27</td></tr><tr><td>Hanover Street</td><td>LRT</td><td>47</td></tr><tr><td>Colinton</td><td>LRT</td><td>45</td></tr><tr><td>Canonmills</td><td>LRT</td><td>47</td></tr><tr><td>Tollcross</td><td>LRT</td><td>10</td></tr><tr><td>Brunstane</td><td>LRT</td><td>45</td></tr><tr><td>Craiglockhart</td><td>LRT</td><td>27</td></tr><tr><td>Newhaven</td><td>LRT</td><td>10</td></tr><tr><td>Leith</td><td>LRT</td><td>10</td></tr><tr><td>Colinton</td><td>LRT</td><td>47</td></tr><tr><td>Craiglockhart</td><td>LRT</td><td>4</td></tr><tr><td>Princes Street</td><td>LRT</td><td>10</td></tr><tr><td>Northfield</td><td>LRT</td><td>45</td></tr><tr><td>Haymarket</td><td>LRT</td><td>4</td></tr><tr><td>Silverknowes</td><td>LRT</td><td>27</td></tr><tr><td>Bingham</td><td>LRT</td><td>4</td></tr><tr><td>Northfield</td><td>LRT</td><td>4</td></tr><tr><td>Hanover Street</td><td>LRT</td><td>45</td></tr><tr><td>Torphin</td><td>LRT</td><td>10</td></tr><tr><td>Leith Walk</td><td>LRT</td><td>10</td></tr><tr><td>Currie</td><td>LRT</td><td>45</td></tr><tr><td>Currie</td><td>LRT</td><td>47</td></tr><tr><td>Colinton</td><td>LRT</td><td>10</td></tr><tr><td>Craiglockhart</td><td>LRT</td><td>47</td></tr><tr><td>Hunters Tryst</td><td>LRT</td><td>27</td></tr><tr><td>Hanover Street</td><td>LRT</td><td>27</td></tr><tr><td>Canonmills</td><td>LRT</td><td>27</td></tr><tr><td>Muirhouse</td><td>LRT</td><td>10</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(route.join(route\n",
    "            .withColumnsRenamed(Map(\n",
    "                \"pos\" -> \"pos2\", \"stop\" -> \"stop2\")),\n",
    "            Seq(\"company\", \"num\"))\n",
    "    .join(stops, col(\"stop\")===col(\"id\"))\n",
    "    .join(stops\n",
    "          .withColumnsRenamed(Map(\n",
    "              \"id\" -> \"id2\", \"name\" -> \"name2\")), \n",
    "          col(\"stop2\")===col(\"id2\"))\n",
    "    .filter((col(\"name\")===\"Craiglockhart\") && \n",
    "            (col(\"company\")===\"LRT\"))\n",
    "    .select(\"name2\", \"company\", \"num\")\n",
    "    .dropDuplicates()\n",
    "    .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10.\n",
    "Find the routes involving two buses that can go from **Craiglockhart** to **Lochend**.\n",
    "Show the bus no. and company for the first bus, the name of the stop for the transfer,\n",
    "and the bus no. and company for the second bus.\n",
    "\n",
    "> _Hint_    \n",
    "> Self-join twice to find buses that visit Craiglockhart and Lochend, then join those on matching stops."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>num</th><th>company</th><th>name2</th><th>num2</th><th>company2</th>\n",
       "                </tr>\n",
       "                <tr><td>45</td><td>LRT</td><td>Riccarton Campus</td><td>65</td><td>LRT</td></tr><tr><td>10</td><td>LRT</td><td>Leith</td><td>C5</td><td>SMT</td></tr><tr><td>10</td><td>LRT</td><td>Leith</td><td>34</td><td>LRT</td></tr><tr><td>10</td><td>LRT</td><td>Leith</td><td>87</td><td>LRT</td></tr><tr><td>10</td><td>LRT</td><td>Leith</td><td>35</td><td>LRT</td></tr><tr><td>45</td><td>LRT</td><td>Duddingston</td><td>42</td><td>LRT</td></tr><tr><td>45</td><td>LRT</td><td>Duddingston</td><td>46A</td><td>LRT</td></tr><tr><td>4</td><td>LRT</td><td>Princes Street</td><td>C5</td><td>SMT</td></tr><tr><td>4</td><td>LRT</td><td>Princes Street</td><td>65</td><td>LRT</td></tr><tr><td>4</td><td>LRT</td><td>Haymarket</td><td>65</td><td>LRT</td></tr><tr><td>4</td><td>LRT</td><td>Haymarket</td><td>C5</td><td>SMT</td></tr><tr><td>27</td><td>LRT</td><td>Crewe Toll</td><td>20</td><td>LRT</td></tr><tr><td>45</td><td>LRT</td><td>London Road</td><td>46A</td><td>LRT</td></tr><tr><td>45</td><td>LRT</td><td>London Road</td><td>20</td><td>LRT</td></tr><tr><td>45</td><td>LRT</td><td>London Road</td><td>65</td><td>LRT</td></tr><tr><td>45</td><td>LRT</td><td>London Road</td><td>34</td><td>LRT</td></tr><tr><td>45</td><td>LRT</td><td>London Road</td><td>C5</td><td>SMT</td></tr><tr><td>45</td><td>LRT</td><td>London Road</td><td>87A</td><td>LRT</td></tr><tr><td>45</td><td>LRT</td><td>London Road</td><td>35</td><td>LRT</td></tr><tr><td>45</td><td>LRT</td><td>London Road</td><td>87</td><td>LRT</td></tr><tr><td>45</td><td>LRT</td><td>London Road</td><td>42</td><td>LRT</td></tr><tr><td>4</td><td>LRT</td><td>London Road</td><td>46A</td><td>LRT</td></tr><tr><td>4</td><td>LRT</td><td>London Road</td><td>20</td><td>LRT</td></tr><tr><td>4</td><td>LRT</td><td>London Road</td><td>65</td><td>LRT</td></tr><tr><td>4</td><td>LRT</td><td>London Road</td><td>34</td><td>LRT</td></tr><tr><td>4</td><td>LRT</td><td>London Road</td><td>C5</td><td>SMT</td></tr><tr><td>4</td><td>LRT</td><td>London Road</td><td>87A</td><td>LRT</td></tr><tr><td>4</td><td>LRT</td><td>London Road</td><td>35</td><td>LRT</td></tr><tr><td>4</td><td>LRT</td><td>London Road</td><td>87</td><td>LRT</td></tr><tr><td>4</td><td>LRT</td><td>London Road</td><td>42</td><td>LRT</td></tr><tr><td>27</td><td>LRT</td><td>Canonmills</td><td>34</td><td>LRT</td></tr><tr><td>27</td><td>LRT</td><td>Canonmills</td><td>35</td><td>LRT</td></tr><tr><td>47</td><td>LRT</td><td>Canonmills</td><td>34</td><td>LRT</td></tr><tr><td>47</td><td>LRT</td><td>Canonmills</td><td>35</td><td>LRT</td></tr><tr><td>10</td><td>LRT</td><td>Princes Street</td><td>C5</td><td>SMT</td></tr><tr><td>10</td><td>LRT</td><td>Princes Street</td><td>65</td><td>LRT</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\u001b[36mbus1\u001b[39m: \u001b[32mDataset\u001b[39m[\u001b[32mRow\u001b[39m] = [name2: string, company: string ... 2 more fields]\n",
       "\u001b[36mbus2\u001b[39m: \u001b[32mDataset\u001b[39m[\u001b[32mRow\u001b[39m] = [stop: int, company: string ... 1 more field]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "val bus1 = (route.join(route\n",
    "            .withColumnsRenamed(Map(\n",
    "                \"pos\" -> \"pos2\", \"stop\" -> \"stop2\")),\n",
    "            Seq(\"company\", \"num\"))\n",
    "    .join(stops, col(\"stop\")===col(\"id\"))\n",
    "    .join(stops\n",
    "          .withColumnsRenamed(Map(\n",
    "              \"id\" -> \"id2\", \"name\" -> \"name2\")), \n",
    "          col(\"stop2\")===col(\"id2\"))\n",
    "    .filter(col(\"name\")===\"Craiglockhart\")\n",
    "    .select(\"name2\", \"company\", \"num\", \"stop2\")\n",
    "    .dropDuplicates())\n",
    "val bus2 = (route.join(route\n",
    "            .withColumnsRenamed(\n",
    "                Map(\"pos\" -> \"pos2\", \"stop\" -> \"stop2\")),\n",
    "            Seq(\"company\", \"num\"))\n",
    "    .join(stops, col(\"stop\")===col(\"id\"))\n",
    "    .join(stops\n",
    "          .withColumnsRenamed(Map(\n",
    "              \"id\" -> \"id2\", \"name\" -> \"name2\")), \n",
    "          col(\"stop2\")===col(\"id2\"))\n",
    "    .filter(col(\"name2\")===\"Lochend\")\n",
    "    .select(\"stop\", \"company\", \"num\")\n",
    "    .dropDuplicates())\n",
    "(bus1.join(bus2\n",
    "           .withColumnsRenamed(Map(\n",
    "               \"company\" -> \"company2\", \"num\" -> \"num2\")), \n",
    "           bus1(\"stop2\")===bus2(\"stop\"))\n",
    "    .select(\"num\", \"company\", \"name2\", \"num2\", \"company2\")\n",
    "    .showHTML())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.stop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Scala",
   "language": "scala",
   "name": "scala"
  },
  "language_info": {
   "codemirror_mode": "text/x-scala",
   "file_extension": ".sc",
   "mimetype": "text/x-scala",
   "name": "scala",
   "nbconvert_exporter": "script",
   "version": "2.12.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
